{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exercise 1: Schema on Read"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "import pandas as pd\n",
    "import matplotlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = SparkSession.builder.getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|               value|\n",
      "+--------------------+\n",
      "|199.72.81.55 - - ...|\n",
      "|unicomp6.unicomp....|\n",
      "|199.120.110.21 - ...|\n",
      "|burger.letters.co...|\n",
      "|199.120.110.21 - ...|\n",
      "+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "dfLog = spark.read.text(\"data/NASA_access_log_Jul95.gz\")\n",
    "dfLog.limit(5).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|               value|\n",
      "+--------------------+\n",
      "|199.72.81.55 - - ...|\n",
      "|unicomp6.unicomp....|\n",
      "|199.120.110.21 - ...|\n",
      "|burger.letters.co...|\n",
      "|199.120.110.21 - ...|\n",
      "+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Data Source: http://ita.ee.lbl.gov/traces/NASA_access_log_Jul95.gz\n",
    "dfLog = spark.read.text(\"data/NASA_access_log_Jul95.gz\")\n",
    "dfLog.limit(5).show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Quick inspection of  the data set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- value: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# see the schema\n",
    "dfLog.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1891715"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of lines\n",
    "dfLog.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|               value|\n",
      "+--------------------+\n",
      "|199.72.81.55 - - ...|\n",
      "|unicomp6.unicomp....|\n",
      "|199.120.110.21 - ...|\n",
      "|burger.letters.co...|\n",
      "|199.120.110.21 - ...|\n",
      "+--------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#what's in there? \n",
    "dfLog.show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----------------------------------------------------------------------------------------------------------------------+\n",
      "|value                                                                                                                  |\n",
      "+-----------------------------------------------------------------------------------------------------------------------+\n",
      "|199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] \"GET /history/apollo/ HTTP/1.0\" 200 6245                                 |\n",
      "|unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985                      |\n",
      "|199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] \"GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0\" 200 4085   |\n",
      "|burger.letters.com - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/countdown/liftoff.html HTTP/1.0\" 304 0               |\n",
      "|199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0\" 200 4179|\n",
      "+-----------------------------------------------------------------------------------------------------------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#a better show?\n",
    "dfLog.show(5, truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] \"GET /history/apollo/ HTTP/1.0\" 200 6245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] \"GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0\" 200 4085</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>burger.letters.com - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/countdown/liftoff.html HTTP/1.0\" 304 0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0\" 200 4179</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                     value\n",
       "0                                   199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] \"GET /history/apollo/ HTTP/1.0\" 200 6245\n",
       "1                        unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985\n",
       "2     199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] \"GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0\" 200 4085\n",
       "3                 burger.letters.com - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/countdown/liftoff.html HTTP/1.0\" 304 0\n",
       "4  199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0\" 200 4179"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#pandas to the rescue\n",
    "pd.set_option('max_colwidth', 200)\n",
    "dfLog.limit(5).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Let' try simple parsing with split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>value</th>\n",
       "      <th>tokenized</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] \"GET /history/apollo/ HTTP/1.0\" 200 6245</td>\n",
       "      <td>[199.72.81.55, -, -, [01/Jul/1995:00:00:01, -0400], \"GET, /history/apollo/, HTTP/1.0\", 200, 6245]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985</td>\n",
       "      <td>[unicomp6.unicomp.net, -, -, [01/Jul/1995:00:00:06, -0400], \"GET, /shuttle/countdown/, HTTP/1.0\", 200, 3985]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] \"GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0\" 200 4085</td>\n",
       "      <td>[199.120.110.21, -, -, [01/Jul/1995:00:00:09, -0400], \"GET, /shuttle/missions/sts-73/mission-sts-73.html, HTTP/1.0\", 200, 4085]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>burger.letters.com - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/countdown/liftoff.html HTTP/1.0\" 304 0</td>\n",
       "      <td>[burger.letters.com, -, -, [01/Jul/1995:00:00:11, -0400], \"GET, /shuttle/countdown/liftoff.html, HTTP/1.0\", 304, 0]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0\" 200 4179</td>\n",
       "      <td>[199.120.110.21, -, -, [01/Jul/1995:00:00:11, -0400], \"GET, /shuttle/missions/sts-73/sts-73-patch-small.gif, HTTP/1.0\", 200, 4179]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /images/NASA-logosmall.gif HTTP/1.0\" 304 0</td>\n",
       "      <td>[burger.letters.com, -, -, [01/Jul/1995:00:00:12, -0400], \"GET, /images/NASA-logosmall.gif, HTTP/1.0\", 304, 0]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/video/livevideo.gif HTTP/1.0\" 200 0</td>\n",
       "      <td>[burger.letters.com, -, -, [01/Jul/1995:00:00:12, -0400], \"GET, /shuttle/countdown/video/livevideo.gif, HTTP/1.0\", 200, 0]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/countdown.html HTTP/1.0\" 200 3985</td>\n",
       "      <td>[205.212.115.106, -, -, [01/Jul/1995:00:00:12, -0400], \"GET, /shuttle/countdown/countdown.html, HTTP/1.0\", 200, 3985]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>d104.aa.net - - [01/Jul/1995:00:00:13 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985</td>\n",
       "      <td>[d104.aa.net, -, -, [01/Jul/1995:00:00:13, -0400], \"GET, /shuttle/countdown/, HTTP/1.0\", 200, 3985]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>129.94.144.152 - - [01/Jul/1995:00:00:13 -0400] \"GET / HTTP/1.0\" 200 7074</td>\n",
       "      <td>[129.94.144.152, -, -, [01/Jul/1995:00:00:13, -0400], \"GET, /, HTTP/1.0\", 200, 7074]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                     value  \\\n",
       "0                                   199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] \"GET /history/apollo/ HTTP/1.0\" 200 6245   \n",
       "1                        unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985   \n",
       "2     199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] \"GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0\" 200 4085   \n",
       "3                 burger.letters.com - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/countdown/liftoff.html HTTP/1.0\" 304 0   \n",
       "4  199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0\" 200 4179   \n",
       "5                      burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /images/NASA-logosmall.gif HTTP/1.0\" 304 0   \n",
       "6          burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/video/livevideo.gif HTTP/1.0\" 200 0   \n",
       "7               205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/countdown.html HTTP/1.0\" 200 3985   \n",
       "8                                 d104.aa.net - - [01/Jul/1995:00:00:13 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985   \n",
       "9                                                129.94.144.152 - - [01/Jul/1995:00:00:13 -0400] \"GET / HTTP/1.0\" 200 7074   \n",
       "\n",
       "                                                                                                                            tokenized  \n",
       "0                                   [199.72.81.55, -, -, [01/Jul/1995:00:00:01, -0400], \"GET, /history/apollo/, HTTP/1.0\", 200, 6245]  \n",
       "1                        [unicomp6.unicomp.net, -, -, [01/Jul/1995:00:00:06, -0400], \"GET, /shuttle/countdown/, HTTP/1.0\", 200, 3985]  \n",
       "2     [199.120.110.21, -, -, [01/Jul/1995:00:00:09, -0400], \"GET, /shuttle/missions/sts-73/mission-sts-73.html, HTTP/1.0\", 200, 4085]  \n",
       "3                 [burger.letters.com, -, -, [01/Jul/1995:00:00:11, -0400], \"GET, /shuttle/countdown/liftoff.html, HTTP/1.0\", 304, 0]  \n",
       "4  [199.120.110.21, -, -, [01/Jul/1995:00:00:11, -0400], \"GET, /shuttle/missions/sts-73/sts-73-patch-small.gif, HTTP/1.0\", 200, 4179]  \n",
       "5                      [burger.letters.com, -, -, [01/Jul/1995:00:00:12, -0400], \"GET, /images/NASA-logosmall.gif, HTTP/1.0\", 304, 0]  \n",
       "6          [burger.letters.com, -, -, [01/Jul/1995:00:00:12, -0400], \"GET, /shuttle/countdown/video/livevideo.gif, HTTP/1.0\", 200, 0]  \n",
       "7               [205.212.115.106, -, -, [01/Jul/1995:00:00:12, -0400], \"GET, /shuttle/countdown/countdown.html, HTTP/1.0\", 200, 3985]  \n",
       "8                                 [d104.aa.net, -, -, [01/Jul/1995:00:00:13, -0400], \"GET, /shuttle/countdown/, HTTP/1.0\", 200, 3985]  \n",
       "9                                                [129.94.144.152, -, -, [01/Jul/1995:00:00:13, -0400], \"GET, /, HTTP/1.0\", 200, 7074]  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyspark.sql.functions import split\n",
    "dfArrays = dfLog.withColumn(\"tokenized\", split(\"value\",\" \"))\n",
    "dfArrays.limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Second attempt, let's build a custom parsing UDF "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import udf\n",
    "\n",
    "@udf\n",
    "def parseUDF(line):\n",
    "    import re\n",
    "    PATTERN = '^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+) (\\S+)\\s*(\\S*)\" (\\d{3}) (\\S+)'\n",
    "    match = re.search(PATTERN, line)\n",
    "    if match is None:\n",
    "        return (line, 0)\n",
    "    size_field = match.group(9)\n",
    "    if size_field == '-':\n",
    "        size = 0\n",
    "    else:\n",
    "        size = match.group(9)\n",
    "    return {\n",
    "        \"host\"          : match.group(1), \n",
    "        \"client_identd\" : match.group(2), \n",
    "        \"user_id\"       : match.group(3), \n",
    "        \"date_time\"     : match.group(4), \n",
    "        \"method\"        : match.group(5),\n",
    "        \"endpoint\"      : match.group(6),\n",
    "        \"protocol\"      : match.group(7),\n",
    "        \"response_code\" : int(match.group(8)),\n",
    "        \"content_size\"  : size\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>value</th>\n",
       "      <th>parsed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] \"GET /history/apollo/ HTTP/1.0\" 200 6245</td>\n",
       "      <td>{response_code=200, protocol=HTTP/1.0, endpoint=/history/apollo/, content_size=6245, method=GET, date_time=01/Jul/1995:00:00:01 -0400, user_id=-, host=199.72.81.55, client_identd=-}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985</td>\n",
       "      <td>{response_code=200, protocol=HTTP/1.0, endpoint=/shuttle/countdown/, content_size=3985, method=GET, date_time=01/Jul/1995:00:00:06 -0400, user_id=-, host=unicomp6.unicomp.net, client_identd=-}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] \"GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0\" 200 4085</td>\n",
       "      <td>{response_code=200, protocol=HTTP/1.0, endpoint=/shuttle/missions/sts-73/mission-sts-73.html, content_size=4085, method=GET, date_time=01/Jul/1995:00:00:09 -0400, user_id=-, host=199.120.110.21, c...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>burger.letters.com - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/countdown/liftoff.html HTTP/1.0\" 304 0</td>\n",
       "      <td>{response_code=304, protocol=HTTP/1.0, endpoint=/shuttle/countdown/liftoff.html, content_size=0, method=GET, date_time=01/Jul/1995:00:00:11 -0400, user_id=-, host=burger.letters.com, client_identd=-}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0\" 200 4179</td>\n",
       "      <td>{response_code=200, protocol=HTTP/1.0, endpoint=/shuttle/missions/sts-73/sts-73-patch-small.gif, content_size=4179, method=GET, date_time=01/Jul/1995:00:00:11 -0400, user_id=-, host=199.120.110.21...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /images/NASA-logosmall.gif HTTP/1.0\" 304 0</td>\n",
       "      <td>{response_code=304, protocol=HTTP/1.0, endpoint=/images/NASA-logosmall.gif, content_size=0, method=GET, date_time=01/Jul/1995:00:00:12 -0400, user_id=-, host=burger.letters.com, client_identd=-}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/video/livevideo.gif HTTP/1.0\" 200 0</td>\n",
       "      <td>{response_code=200, protocol=HTTP/1.0, endpoint=/shuttle/countdown/video/livevideo.gif, content_size=0, method=GET, date_time=01/Jul/1995:00:00:12 -0400, user_id=-, host=burger.letters.com, client...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/countdown.html HTTP/1.0\" 200 3985</td>\n",
       "      <td>{response_code=200, protocol=HTTP/1.0, endpoint=/shuttle/countdown/countdown.html, content_size=3985, method=GET, date_time=01/Jul/1995:00:00:12 -0400, user_id=-, host=205.212.115.106, client_iden...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>d104.aa.net - - [01/Jul/1995:00:00:13 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985</td>\n",
       "      <td>{response_code=200, protocol=HTTP/1.0, endpoint=/shuttle/countdown/, content_size=3985, method=GET, date_time=01/Jul/1995:00:00:13 -0400, user_id=-, host=d104.aa.net, client_identd=-}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>129.94.144.152 - - [01/Jul/1995:00:00:13 -0400] \"GET / HTTP/1.0\" 200 7074</td>\n",
       "      <td>{response_code=200, protocol=HTTP/1.0, endpoint=/, content_size=7074, method=GET, date_time=01/Jul/1995:00:00:13 -0400, user_id=-, host=129.94.144.152, client_identd=-}</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                     value  \\\n",
       "0                                   199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] \"GET /history/apollo/ HTTP/1.0\" 200 6245   \n",
       "1                        unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985   \n",
       "2     199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] \"GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0\" 200 4085   \n",
       "3                 burger.letters.com - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/countdown/liftoff.html HTTP/1.0\" 304 0   \n",
       "4  199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0\" 200 4179   \n",
       "5                      burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /images/NASA-logosmall.gif HTTP/1.0\" 304 0   \n",
       "6          burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/video/livevideo.gif HTTP/1.0\" 200 0   \n",
       "7               205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/countdown.html HTTP/1.0\" 200 3985   \n",
       "8                                 d104.aa.net - - [01/Jul/1995:00:00:13 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985   \n",
       "9                                                129.94.144.152 - - [01/Jul/1995:00:00:13 -0400] \"GET / HTTP/1.0\" 200 7074   \n",
       "\n",
       "                                                                                                                                                                                                    parsed  \n",
       "0                    {response_code=200, protocol=HTTP/1.0, endpoint=/history/apollo/, content_size=6245, method=GET, date_time=01/Jul/1995:00:00:01 -0400, user_id=-, host=199.72.81.55, client_identd=-}  \n",
       "1         {response_code=200, protocol=HTTP/1.0, endpoint=/shuttle/countdown/, content_size=3985, method=GET, date_time=01/Jul/1995:00:00:06 -0400, user_id=-, host=unicomp6.unicomp.net, client_identd=-}  \n",
       "2  {response_code=200, protocol=HTTP/1.0, endpoint=/shuttle/missions/sts-73/mission-sts-73.html, content_size=4085, method=GET, date_time=01/Jul/1995:00:00:09 -0400, user_id=-, host=199.120.110.21, c...  \n",
       "3  {response_code=304, protocol=HTTP/1.0, endpoint=/shuttle/countdown/liftoff.html, content_size=0, method=GET, date_time=01/Jul/1995:00:00:11 -0400, user_id=-, host=burger.letters.com, client_identd=-}  \n",
       "4  {response_code=200, protocol=HTTP/1.0, endpoint=/shuttle/missions/sts-73/sts-73-patch-small.gif, content_size=4179, method=GET, date_time=01/Jul/1995:00:00:11 -0400, user_id=-, host=199.120.110.21...  \n",
       "5       {response_code=304, protocol=HTTP/1.0, endpoint=/images/NASA-logosmall.gif, content_size=0, method=GET, date_time=01/Jul/1995:00:00:12 -0400, user_id=-, host=burger.letters.com, client_identd=-}  \n",
       "6  {response_code=200, protocol=HTTP/1.0, endpoint=/shuttle/countdown/video/livevideo.gif, content_size=0, method=GET, date_time=01/Jul/1995:00:00:12 -0400, user_id=-, host=burger.letters.com, client...  \n",
       "7  {response_code=200, protocol=HTTP/1.0, endpoint=/shuttle/countdown/countdown.html, content_size=3985, method=GET, date_time=01/Jul/1995:00:00:12 -0400, user_id=-, host=205.212.115.106, client_iden...  \n",
       "8                  {response_code=200, protocol=HTTP/1.0, endpoint=/shuttle/countdown/, content_size=3985, method=GET, date_time=01/Jul/1995:00:00:13 -0400, user_id=-, host=d104.aa.net, client_identd=-}  \n",
       "9                                 {response_code=200, protocol=HTTP/1.0, endpoint=/, content_size=7074, method=GET, date_time=01/Jul/1995:00:00:13 -0400, user_id=-, host=129.94.144.152, client_identd=-}  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Let's start from the beginning\n",
    "dfParsed= dfLog.withColumn(\"parsed\", parseUDF(\"value\"))\n",
    "dfParsed.limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- value: string (nullable = true)\n",
      " |-- parsed: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "dfParsed.printSchema()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Third attempt, let's fix our UDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "#from pyspark.sql.functions import udf # already imported\n",
    "from pyspark.sql.types import MapType, StringType\n",
    "\n",
    "@udf(MapType(StringType(),StringType()))\n",
    "def parseUDFbetter(line):\n",
    "    import re\n",
    "    PATTERN = '^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+) (\\S+)\\s*(\\S*)\" (\\d{3}) (\\S+)'\n",
    "    match = re.search(PATTERN, line)\n",
    "    if match is None:\n",
    "        return (line, 0)\n",
    "    size_field = match.group(9)\n",
    "    if size_field == '-':\n",
    "        size = 0\n",
    "    else:\n",
    "        size = match.group(9)\n",
    "    return {\n",
    "        \"host\"          : match.group(1), \n",
    "        \"client_identd\" : match.group(2), \n",
    "        \"user_id\"       : match.group(3), \n",
    "        \"date_time\"     : match.group(4), \n",
    "        \"method\"        : match.group(5),\n",
    "        \"endpoint\"      : match.group(6),\n",
    "        \"protocol\"      : match.group(7),\n",
    "        \"response_code\" : int(match.group(8)),\n",
    "        \"content_size\"  : size\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>value</th>\n",
       "      <th>parsed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] \"GET /history/apollo/ HTTP/1.0\" 200 6245</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/history/apollo/', 'content_size': '6245', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:01 -0400', 'user_id': '-', 'host': '199.72...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:06 -0400', 'user_id': '-', 'host': 'uni...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] \"GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0\" 200 4085</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/mission-sts-73.html', 'content_size': '4085', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:09 -0400', 'us...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>burger.letters.com - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/countdown/liftoff.html HTTP/1.0\" 304 0</td>\n",
       "      <td>{'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/liftoff.html', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', 'user_id': '-', 'ho...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0\" 200 4179</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/sts-73-patch-small.gif', 'content_size': '4179', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /images/NASA-logosmall.gif HTTP/1.0\" 304 0</td>\n",
       "      <td>{'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/images/NASA-logosmall.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-', 'host': ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/video/livevideo.gif HTTP/1.0\" 200 0</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/video/livevideo.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/countdown.html HTTP/1.0\" 200 3985</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/countdown.html', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-'...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>d104.aa.net - - [01/Jul/1995:00:00:13 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': 'd10...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>129.94.144.152 - - [01/Jul/1995:00:00:13 -0400] \"GET / HTTP/1.0\" 200 7074</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/', 'content_size': '7074', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': '129.94.144.152', 'cli...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                     value  \\\n",
       "0                                   199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] \"GET /history/apollo/ HTTP/1.0\" 200 6245   \n",
       "1                        unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985   \n",
       "2     199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] \"GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0\" 200 4085   \n",
       "3                 burger.letters.com - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/countdown/liftoff.html HTTP/1.0\" 304 0   \n",
       "4  199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0\" 200 4179   \n",
       "5                      burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /images/NASA-logosmall.gif HTTP/1.0\" 304 0   \n",
       "6          burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/video/livevideo.gif HTTP/1.0\" 200 0   \n",
       "7               205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/countdown.html HTTP/1.0\" 200 3985   \n",
       "8                                 d104.aa.net - - [01/Jul/1995:00:00:13 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985   \n",
       "9                                                129.94.144.152 - - [01/Jul/1995:00:00:13 -0400] \"GET / HTTP/1.0\" 200 7074   \n",
       "\n",
       "                                                                                                                                                                                                    parsed  \n",
       "0  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/history/apollo/', 'content_size': '6245', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:01 -0400', 'user_id': '-', 'host': '199.72...  \n",
       "1  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:06 -0400', 'user_id': '-', 'host': 'uni...  \n",
       "2  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/mission-sts-73.html', 'content_size': '4085', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:09 -0400', 'us...  \n",
       "3  {'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/liftoff.html', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', 'user_id': '-', 'ho...  \n",
       "4  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/sts-73-patch-small.gif', 'content_size': '4179', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', ...  \n",
       "5  {'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/images/NASA-logosmall.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-', 'host': ...  \n",
       "6  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/video/livevideo.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '...  \n",
       "7  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/countdown.html', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-'...  \n",
       "8  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': 'd10...  \n",
       "9  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/', 'content_size': '7074', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': '129.94.144.152', 'cli...  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Let's start from the beginning\n",
    "dfParsed= dfLog.withColumn(\"parsed\", parseUDFbetter(\"value\"))\n",
    "dfParsed.limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>value</th>\n",
       "      <th>parsed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] \"GET /history/apollo/ HTTP/1.0\" 200 6245</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/history/apollo/', 'content_size': '6245', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:01 -0400', 'user_id': '-', 'host': '199.72...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:06 -0400', 'user_id': '-', 'host': 'uni...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] \"GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0\" 200 4085</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/mission-sts-73.html', 'content_size': '4085', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:09 -0400', 'us...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>burger.letters.com - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/countdown/liftoff.html HTTP/1.0\" 304 0</td>\n",
       "      <td>{'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/liftoff.html', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', 'user_id': '-', 'ho...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0\" 200 4179</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/sts-73-patch-small.gif', 'content_size': '4179', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /images/NASA-logosmall.gif HTTP/1.0\" 304 0</td>\n",
       "      <td>{'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/images/NASA-logosmall.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-', 'host': ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/video/livevideo.gif HTTP/1.0\" 200 0</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/video/livevideo.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/countdown.html HTTP/1.0\" 200 3985</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/countdown.html', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-'...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>d104.aa.net - - [01/Jul/1995:00:00:13 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': 'd10...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>129.94.144.152 - - [01/Jul/1995:00:00:13 -0400] \"GET / HTTP/1.0\" 200 7074</td>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/', 'content_size': '7074', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': '129.94.144.152', 'cli...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                     value  \\\n",
       "0                                   199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] \"GET /history/apollo/ HTTP/1.0\" 200 6245   \n",
       "1                        unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985   \n",
       "2     199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] \"GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0\" 200 4085   \n",
       "3                 burger.letters.com - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/countdown/liftoff.html HTTP/1.0\" 304 0   \n",
       "4  199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] \"GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0\" 200 4179   \n",
       "5                      burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /images/NASA-logosmall.gif HTTP/1.0\" 304 0   \n",
       "6          burger.letters.com - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/video/livevideo.gif HTTP/1.0\" 200 0   \n",
       "7               205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] \"GET /shuttle/countdown/countdown.html HTTP/1.0\" 200 3985   \n",
       "8                                 d104.aa.net - - [01/Jul/1995:00:00:13 -0400] \"GET /shuttle/countdown/ HTTP/1.0\" 200 3985   \n",
       "9                                                129.94.144.152 - - [01/Jul/1995:00:00:13 -0400] \"GET / HTTP/1.0\" 200 7074   \n",
       "\n",
       "                                                                                                                                                                                                    parsed  \n",
       "0  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/history/apollo/', 'content_size': '6245', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:01 -0400', 'user_id': '-', 'host': '199.72...  \n",
       "1  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:06 -0400', 'user_id': '-', 'host': 'uni...  \n",
       "2  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/mission-sts-73.html', 'content_size': '4085', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:09 -0400', 'us...  \n",
       "3  {'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/liftoff.html', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', 'user_id': '-', 'ho...  \n",
       "4  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/sts-73-patch-small.gif', 'content_size': '4179', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', ...  \n",
       "5  {'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/images/NASA-logosmall.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-', 'host': ...  \n",
       "6  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/video/livevideo.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '...  \n",
       "7  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/countdown.html', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-'...  \n",
       "8  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': 'd10...  \n",
       "9  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/', 'content_size': '7074', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': '129.94.144.152', 'cli...  "
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Let's start from the beginning\n",
    "dfParsed= dfLog.withColumn(\"parsed\", parseUDFbetter(\"value\"))\n",
    "dfParsed.limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- value: string (nullable = true)\n",
      " |-- parsed: map (nullable = true)\n",
      " |    |-- key: string\n",
      " |    |-- value: string (valueContainsNull = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#Bingo!! we'got a column of type map with the fields parsed\n",
    "dfParsed.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>parsed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/history/apollo/', 'content_size': '6245', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:01 -0400', 'user_id': '-', 'host': '199.72...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:06 -0400', 'user_id': '-', 'host': 'uni...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/mission-sts-73.html', 'content_size': '4085', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:09 -0400', 'us...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>{'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/liftoff.html', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', 'user_id': '-', 'ho...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/sts-73-patch-small.gif', 'content_size': '4179', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>{'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/images/NASA-logosmall.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-', 'host': ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/video/livevideo.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/countdown.html', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-'...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': 'd10...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>{'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/', 'content_size': '7074', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': '129.94.144.152', 'cli...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                                                                                                                                    parsed\n",
       "0  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/history/apollo/', 'content_size': '6245', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:01 -0400', 'user_id': '-', 'host': '199.72...\n",
       "1  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:06 -0400', 'user_id': '-', 'host': 'uni...\n",
       "2  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/mission-sts-73.html', 'content_size': '4085', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:09 -0400', 'us...\n",
       "3  {'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/liftoff.html', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', 'user_id': '-', 'ho...\n",
       "4  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/missions/sts-73/sts-73-patch-small.gif', 'content_size': '4179', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:11 -0400', ...\n",
       "5  {'response_code': '304', 'protocol': 'HTTP/1.0', 'endpoint': '/images/NASA-logosmall.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-', 'host': ...\n",
       "6  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/video/livevideo.gif', 'content_size': '0', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '...\n",
       "7  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/countdown.html', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:12 -0400', 'user_id': '-'...\n",
       "8  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/shuttle/countdown/', 'content_size': '3985', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': 'd10...\n",
       "9  {'response_code': '200', 'protocol': 'HTTP/1.0', 'endpoint': '/', 'content_size': '7074', 'method': 'GET', 'date_time': '01/Jul/1995:00:00:13 -0400', 'user_id': '-', 'host': '129.94.144.152', 'cli..."
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfParsed.select(\"parsed\").limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Let's build separate columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|                host|\n",
      "+--------------------+\n",
      "|        199.72.81.55|\n",
      "|unicomp6.unicomp.net|\n",
      "|      199.120.110.21|\n",
      "|  burger.letters.com|\n",
      "|      199.120.110.21|\n",
      "+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "dfParsed.selectExpr(\"parsed['host'] as host\").limit(5).show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+\n",
      "|        parsed[host]|   parsed[date_time]|\n",
      "+--------------------+--------------------+\n",
      "|        199.72.81.55|01/Jul/1995:00:00...|\n",
      "|unicomp6.unicomp.net|01/Jul/1995:00:00...|\n",
      "|      199.120.110.21|01/Jul/1995:00:00...|\n",
      "|  burger.letters.com|01/Jul/1995:00:00...|\n",
      "|      199.120.110.21|01/Jul/1995:00:00...|\n",
      "+--------------------+--------------------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "dfParsed.selectExpr([\"parsed['host']\", \"parsed['date_time']\"]).show(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[\"parsed['host'] as host\",\n",
       " \"parsed['client_identd'] as client_identd\",\n",
       " \"parsed['user_id'] as user_id\",\n",
       " \"parsed['date_time'] as date_time\",\n",
       " \"parsed['method'] as method\",\n",
       " \"parsed['endpoint'] as endpoint\",\n",
       " \"parsed['protocol'] as protocol\",\n",
       " \"parsed['response_code'] as response_code\",\n",
       " \"parsed['content_size'] as content_size\"]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fields = [\"host\", \"client_identd\",\"user_id\", \"date_time\", \"method\", \"endpoint\", \"protocol\", \"response_code\", \"content_size\"]\n",
    "exprs = [ \"parsed['{}'] as {}\".format(field,field) for field in fields]\n",
    "exprs\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>client_identd</th>\n",
       "      <th>user_id</th>\n",
       "      <th>date_time</th>\n",
       "      <th>method</th>\n",
       "      <th>endpoint</th>\n",
       "      <th>protocol</th>\n",
       "      <th>response_code</th>\n",
       "      <th>content_size</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>199.72.81.55</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>01/Jul/1995:00:00:01 -0400</td>\n",
       "      <td>GET</td>\n",
       "      <td>/history/apollo/</td>\n",
       "      <td>HTTP/1.0</td>\n",
       "      <td>200</td>\n",
       "      <td>6245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>unicomp6.unicomp.net</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>01/Jul/1995:00:00:06 -0400</td>\n",
       "      <td>GET</td>\n",
       "      <td>/shuttle/countdown/</td>\n",
       "      <td>HTTP/1.0</td>\n",
       "      <td>200</td>\n",
       "      <td>3985</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>199.120.110.21</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>01/Jul/1995:00:00:09 -0400</td>\n",
       "      <td>GET</td>\n",
       "      <td>/shuttle/missions/sts-73/mission-sts-73.html</td>\n",
       "      <td>HTTP/1.0</td>\n",
       "      <td>200</td>\n",
       "      <td>4085</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>burger.letters.com</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>01/Jul/1995:00:00:11 -0400</td>\n",
       "      <td>GET</td>\n",
       "      <td>/shuttle/countdown/liftoff.html</td>\n",
       "      <td>HTTP/1.0</td>\n",
       "      <td>304</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>199.120.110.21</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>01/Jul/1995:00:00:11 -0400</td>\n",
       "      <td>GET</td>\n",
       "      <td>/shuttle/missions/sts-73/sts-73-patch-small.gif</td>\n",
       "      <td>HTTP/1.0</td>\n",
       "      <td>200</td>\n",
       "      <td>4179</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   host client_identd user_id                   date_time  \\\n",
       "0          199.72.81.55             -       -  01/Jul/1995:00:00:01 -0400   \n",
       "1  unicomp6.unicomp.net             -       -  01/Jul/1995:00:00:06 -0400   \n",
       "2        199.120.110.21             -       -  01/Jul/1995:00:00:09 -0400   \n",
       "3    burger.letters.com             -       -  01/Jul/1995:00:00:11 -0400   \n",
       "4        199.120.110.21             -       -  01/Jul/1995:00:00:11 -0400   \n",
       "\n",
       "  method                                         endpoint  protocol  \\\n",
       "0    GET                                 /history/apollo/  HTTP/1.0   \n",
       "1    GET                              /shuttle/countdown/  HTTP/1.0   \n",
       "2    GET     /shuttle/missions/sts-73/mission-sts-73.html  HTTP/1.0   \n",
       "3    GET                  /shuttle/countdown/liftoff.html  HTTP/1.0   \n",
       "4    GET  /shuttle/missions/sts-73/sts-73-patch-small.gif  HTTP/1.0   \n",
       "\n",
       "  response_code content_size  \n",
       "0           200         6245  \n",
       "1           200         3985  \n",
       "2           200         4085  \n",
       "3           304            0  \n",
       "4           200         4179  "
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfClean = dfParsed.selectExpr(*exprs)\n",
    "dfClean.limit(5).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Popular hosts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>piweba3y.prodigy.com</td>\n",
       "      <td>17572</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>piweba4y.prodigy.com</td>\n",
       "      <td>11591</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>piweba1y.prodigy.com</td>\n",
       "      <td>9868</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>alyssa.prodigy.com</td>\n",
       "      <td>7852</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>siltb10.orl.mmc.com</td>\n",
       "      <td>7573</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>piweba2y.prodigy.com</td>\n",
       "      <td>5922</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>edams.ksc.nasa.gov</td>\n",
       "      <td>5434</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>163.206.89.4</td>\n",
       "      <td>4906</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>news.ti.com</td>\n",
       "      <td>4863</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>disarray.demon.co.uk</td>\n",
       "      <td>4353</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   host  count\n",
       "0  piweba3y.prodigy.com  17572\n",
       "1  piweba4y.prodigy.com  11591\n",
       "2  piweba1y.prodigy.com   9868\n",
       "3    alyssa.prodigy.com   7852\n",
       "4   siltb10.orl.mmc.com   7573\n",
       "5  piweba2y.prodigy.com   5922\n",
       "6    edams.ksc.nasa.gov   5434\n",
       "7          163.206.89.4   4906\n",
       "8           news.ti.com   4863\n",
       "9  disarray.demon.co.uk   4353"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyspark.sql.functions import desc\n",
    "dfClean.groupBy(\"host\").count().orderBy(desc(\"count\")).limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Popular content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>endpoint</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>/images/NASA-logosmall.gif</td>\n",
       "      <td>111330</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>/images/KSC-logosmall.gif</td>\n",
       "      <td>89638</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/images/MOSAIC-logosmall.gif</td>\n",
       "      <td>60467</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>/images/USA-logosmall.gif</td>\n",
       "      <td>60013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>/images/WORLD-logosmall.gif</td>\n",
       "      <td>59488</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>/images/ksclogo-medium.gif</td>\n",
       "      <td>58801</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>/images/launch-logo.gif</td>\n",
       "      <td>40871</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>/shuttle/countdown/</td>\n",
       "      <td>40278</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>/ksc.html</td>\n",
       "      <td>40226</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>/images/ksclogosmall.gif</td>\n",
       "      <td>33585</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       endpoint   count\n",
       "0    /images/NASA-logosmall.gif  111330\n",
       "1     /images/KSC-logosmall.gif   89638\n",
       "2  /images/MOSAIC-logosmall.gif   60467\n",
       "3     /images/USA-logosmall.gif   60013\n",
       "4   /images/WORLD-logosmall.gif   59488\n",
       "5    /images/ksclogo-medium.gif   58801\n",
       "6       /images/launch-logo.gif   40871\n",
       "7           /shuttle/countdown/   40278\n",
       "8                     /ksc.html   40226\n",
       "9      /images/ksclogosmall.gif   33585"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyspark.sql.functions import desc\n",
    "dfClean.groupBy(\"endpoint\").count().orderBy(desc(\"count\")).limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Large Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>endpoint</th>\n",
       "      <th>content_size</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>/images/cdrom-1-95/img0007.jpg</td>\n",
       "      <td>99981</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>/shuttle/missions/sts-71/movies/sts-71-launch.mpg</td>\n",
       "      <td>999424</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/shuttle/missions/sts-71/movies/sts-71-launch.mpg</td>\n",
       "      <td>999424</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>/history/apollo/apollo-13/images/index.gif</td>\n",
       "      <td>99942</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>/history/apollo/apollo-13/images/index.gif</td>\n",
       "      <td>99942</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>/history/apollo/apollo-13/images/index.gif</td>\n",
       "      <td>99942</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>/history/apollo/apollo-13/images/index.gif</td>\n",
       "      <td>99942</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>/history/apollo/apollo-13/images/index.gif</td>\n",
       "      <td>99942</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>/history/apollo/apollo-13/images/index.gif</td>\n",
       "      <td>99942</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>/history/apollo/apollo-13/images/index.gif</td>\n",
       "      <td>99942</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            endpoint content_size\n",
       "0                     /images/cdrom-1-95/img0007.jpg        99981\n",
       "1  /shuttle/missions/sts-71/movies/sts-71-launch.mpg       999424\n",
       "2  /shuttle/missions/sts-71/movies/sts-71-launch.mpg       999424\n",
       "3         /history/apollo/apollo-13/images/index.gif        99942\n",
       "4         /history/apollo/apollo-13/images/index.gif        99942\n",
       "5         /history/apollo/apollo-13/images/index.gif        99942\n",
       "6         /history/apollo/apollo-13/images/index.gif        99942\n",
       "7         /history/apollo/apollo-13/images/index.gif        99942\n",
       "8         /history/apollo/apollo-13/images/index.gif        99942\n",
       "9         /history/apollo/apollo-13/images/index.gif        99942"
      ]
     },
     "execution_count": 187,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfClean.createOrReplaceTempView(\"cleanlog\")\n",
    "spark.sql(\"\"\"\n",
    "select endpoint, content_size\n",
    "from cleanlog \n",
    "order by content_size desc\n",
    "\"\"\").limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>host</th>\n",
       "      <th>client_identd</th>\n",
       "      <th>user_id</th>\n",
       "      <th>date_time</th>\n",
       "      <th>method</th>\n",
       "      <th>endpoint</th>\n",
       "      <th>protocol</th>\n",
       "      <th>response_code</th>\n",
       "      <th>content_size</th>\n",
       "      <th>content_size_bytes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>199.72.81.55</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>01/Jul/1995:00:00:01 -0400</td>\n",
       "      <td>GET</td>\n",
       "      <td>/history/apollo/</td>\n",
       "      <td>HTTP/1.0</td>\n",
       "      <td>200</td>\n",
       "      <td>6245</td>\n",
       "      <td>6245</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>unicomp6.unicomp.net</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>01/Jul/1995:00:00:06 -0400</td>\n",
       "      <td>GET</td>\n",
       "      <td>/shuttle/countdown/</td>\n",
       "      <td>HTTP/1.0</td>\n",
       "      <td>200</td>\n",
       "      <td>3985</td>\n",
       "      <td>3985</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>199.120.110.21</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>01/Jul/1995:00:00:09 -0400</td>\n",
       "      <td>GET</td>\n",
       "      <td>/shuttle/missions/sts-73/mission-sts-73.html</td>\n",
       "      <td>HTTP/1.0</td>\n",
       "      <td>200</td>\n",
       "      <td>4085</td>\n",
       "      <td>4085</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>burger.letters.com</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>01/Jul/1995:00:00:11 -0400</td>\n",
       "      <td>GET</td>\n",
       "      <td>/shuttle/countdown/liftoff.html</td>\n",
       "      <td>HTTP/1.0</td>\n",
       "      <td>304</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>199.120.110.21</td>\n",
       "      <td>-</td>\n",
       "      <td>-</td>\n",
       "      <td>01/Jul/1995:00:00:11 -0400</td>\n",
       "      <td>GET</td>\n",
       "      <td>/shuttle/missions/sts-73/sts-73-patch-small.gif</td>\n",
       "      <td>HTTP/1.0</td>\n",
       "      <td>200</td>\n",
       "      <td>4179</td>\n",
       "      <td>4179</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   host client_identd user_id                   date_time  \\\n",
       "0          199.72.81.55             -       -  01/Jul/1995:00:00:01 -0400   \n",
       "1  unicomp6.unicomp.net             -       -  01/Jul/1995:00:00:06 -0400   \n",
       "2        199.120.110.21             -       -  01/Jul/1995:00:00:09 -0400   \n",
       "3    burger.letters.com             -       -  01/Jul/1995:00:00:11 -0400   \n",
       "4        199.120.110.21             -       -  01/Jul/1995:00:00:11 -0400   \n",
       "\n",
       "  method                                         endpoint  protocol  \\\n",
       "0    GET                                 /history/apollo/  HTTP/1.0   \n",
       "1    GET                              /shuttle/countdown/  HTTP/1.0   \n",
       "2    GET     /shuttle/missions/sts-73/mission-sts-73.html  HTTP/1.0   \n",
       "3    GET                  /shuttle/countdown/liftoff.html  HTTP/1.0   \n",
       "4    GET  /shuttle/missions/sts-73/sts-73-patch-small.gif  HTTP/1.0   \n",
       "\n",
       "  response_code content_size  content_size_bytes  \n",
       "0           200         6245                6245  \n",
       "1           200         3985                3985  \n",
       "2           200         4085                4085  \n",
       "3           304            0                   0  \n",
       "4           200         4179                4179  "
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyspark.sql.functions import expr\n",
    "dfCleanTyped = dfClean.withColumn(\"content_size_bytes\", expr(\"cast(content_size  as int)\"))\n",
    "dfCleanTyped.limit(5).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>endpoint</th>\n",
       "      <th>content_size</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>/shuttle/countdown/video/livevideo.jpeg</td>\n",
       "      <td>6823936</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>/statistics/1995/bkup/Mar95_full.html</td>\n",
       "      <td>3155499</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>/statistics/1995/bkup/Mar95_full.html</td>\n",
       "      <td>3155499</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>/statistics/1995/bkup/Mar95_full.html</td>\n",
       "      <td>3155499</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>/statistics/1995/bkup/Mar95_full.html</td>\n",
       "      <td>3155499</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>/statistics/1995/bkup/Mar95_full.html</td>\n",
       "      <td>3155499</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>/statistics/1995/bkup/Mar95_full.html</td>\n",
       "      <td>3155499</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>/statistics/1995/bkup/Mar95_full.html</td>\n",
       "      <td>3155499</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>/statistics/1995/Jun/Jun95_reverse_domains.html</td>\n",
       "      <td>2973350</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>/statistics/1995/Jun/Jun95_reverse_domains.html</td>\n",
       "      <td>2973350</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                          endpoint content_size\n",
       "0          /shuttle/countdown/video/livevideo.jpeg      6823936\n",
       "1            /statistics/1995/bkup/Mar95_full.html      3155499\n",
       "2            /statistics/1995/bkup/Mar95_full.html      3155499\n",
       "3            /statistics/1995/bkup/Mar95_full.html      3155499\n",
       "4            /statistics/1995/bkup/Mar95_full.html      3155499\n",
       "5            /statistics/1995/bkup/Mar95_full.html      3155499\n",
       "6            /statistics/1995/bkup/Mar95_full.html      3155499\n",
       "7            /statistics/1995/bkup/Mar95_full.html      3155499\n",
       "8  /statistics/1995/Jun/Jun95_reverse_domains.html      2973350\n",
       "9  /statistics/1995/Jun/Jun95_reverse_domains.html      2973350"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dfCleanTyped.createOrReplaceTempView(\"cleantypedlog\")\n",
    "spark.sql(\"\"\"\n",
    "select endpoint, content_size\n",
    "from cleantypedlog \n",
    "order by content_size_bytes desc\n",
    "\"\"\").limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Left for you, clean the date column :)\n",
    "# 1- Create a udf that parses that weird format,\n",
    "# 2- Create a new column with a data tiem string that spark would understand\n",
    "# 3- Add a new date-time column properly typed\n",
    "# 4- Print your schema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
